Pool

import os
from epyfun.fs import convert_to_utf8

# Example usage:
#file_path = './data/pooltemp/Outdoor_log_from_20230701_to_20230727.csv'
#encoding = convert_to_utf8(file_path, "interim/pooltemp")
#print(f"The file encoding was: {encoding}")

input_directory = './data/pooltemp'
for filename in os.listdir(input_directory):
    if filename.endswith('.csv'):
        input_file = os.path.join(input_directory, filename)
        convert_to_utf8(input_file, 'interim/pooltemp/' + filename)
%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
Loading configurations from /home/runner/work/strom/strom/pyproject.toml.
Settings changed:
Config value
feedback True
autopandas True
displaycon False
dsn_filename ./connections.ini
%sql duckdb:///:default:
# %sql duckdb:///:memory:
# %sql duckdb:///path/to/file.db

Ingest

%%sql

CREATE OR REPLACE VIEW pooltemp AS
SELECT *
FROM read_csv(
  'interim/pooltemp/*.csv',
  columns = {'time': 'TIMESTAMP', 'temp': 'DOUBLE'},
  decimal_separator = ',',
  delim = '\t',
  filename = True,
  header = True,
  skip = 1
)
;
Success
#%sqlcmd profile -t pooltemp;
#this does not seem to be working, it does not work on saved queries, 
#nor if I create the view, nor if I create the table
%%sql

CREATE OR REPLACE MACRO add_dups_count(_srctbl, _cols) AS TABLE 
(SELECT *, COUNT(*) OVER (PARTITION BY _cols) AS _cnt
FROM pooltemp)
--TODO: figure out how to pass a table name to macros in DuckDB
--FROM _srctbl
;
Success
%%sql

SELECT * FROM add_dups_count(pooltemp, time);
time temp filename _cnt
0 2023-06-14 20:55:00 25.69 interim/pooltemp/Outdoor_log_from_20220101_to_... 1
1 2023-06-14 20:58:40 25.73 interim/pooltemp/Outdoor_log_from_20220101_to_... 1
2 2023-06-14 20:59:00 25.73 interim/pooltemp/Outdoor_log_from_20220101_to_... 1
3 2023-06-14 21:02:10 25.73 interim/pooltemp/Outdoor_log_from_20220101_to_... 1
4 2023-06-14 21:07:30 25.73 interim/pooltemp/Outdoor_log_from_20220101_to_... 1
... ... ... ... ...
296575 2023-08-12 20:27:50 26.08 interim/pooltemp/Outdoor_log_from_20230701_to_... 1
296576 2023-08-12 20:37:50 26.08 interim/pooltemp/Outdoor_log_from_20230701_to_... 1
296577 2023-08-12 20:41:30 26.08 interim/pooltemp/Outdoor_log_from_20230701_to_... 1
296578 2023-08-12 20:45:20 26.08 interim/pooltemp/Outdoor_log_from_20230701_to_... 1
296579 2023-08-12 20:49:50 26.08 interim/pooltemp/Outdoor_log_from_20230701_to_... 1

296580 rows × 4 columns

%%sql

CREATE OR REPLACE MACRO count_dups_by(_srctbl, _cols) AS TABLE
SELECT 
  COUNT(*) AS _tot,
  COUNT(*) FILTER(WHERE _cnt > 1) AS _dups,
  COUNT(DISTINCT _cols) AS _uniq,
  _dups - (_tot - _uniq) AS _duniq,
  _dups / _duniq AS _puniq
FROM add_dups_count(_srctbl, _cols) 
;
Success
%%sql

SELECT * FROM count_dups_by(pooltemp, (time, temp));
_tot _dups _uniq _duniq _puniq
0 296580 42787 270593 16800 2.546845
%%sql --save pooltemp_clean

pooltemp_clean =<< SELECT time, avg(temp) AS temp
FROM pooltemp
GROUP BY time
ORDER BY time
;
time temp
0 2023-06-14 20:52:20 25.63
1 2023-06-14 20:52:30 25.63
2 2023-06-14 20:52:40 25.63
3 2023-06-14 20:52:50 25.63
4 2023-06-14 20:53:00 25.63
... ... ...
267979 2023-08-12 20:50:10 26.08
267980 2023-08-12 20:50:20 26.08
267981 2023-08-12 20:50:30 26.08
267982 2023-08-12 20:50:40 26.08
267983 2023-08-12 20:50:50 26.08

267984 rows × 2 columns

pooltemp_clean
time temp
0 2023-06-14 20:52:20 25.63
1 2023-06-14 20:52:30 25.63
2 2023-06-14 20:52:40 25.63
3 2023-06-14 20:52:50 25.63
4 2023-06-14 20:53:00 25.63
... ... ...
267979 2023-08-12 20:50:10 26.08
267980 2023-08-12 20:50:20 26.08
267981 2023-08-12 20:50:30 26.08
267982 2023-08-12 20:50:40 26.08
267983 2023-08-12 20:50:50 26.08

267984 rows × 2 columns

import plotly.express as px

import pandas as pd
import numpy as np

fig = px.scatter(pooltemp_clean, x="time", y="temp", render_mode='webgl')
fig.update_traces(marker_line=dict(width=1, color='DarkSlateGray'))
fig.show()

%%timeit %%sql SELECT COUNT(DISTINCT (time, temp)), COUNT(*) FROM pooltemp ;

%%timeit %%sql SELECT COUNT() FROM ( SELECT time, COUNT() FROM pooltemp GROUP BY time HAVING COUNT(*) > 1

) ;

%%timeit %%sql –save duplicates_detail

SELECT FROM ( SELECT , COUNT() OVER (PARTITION BY time) AS cnt COUNT() OVER (PARTITION BY time, temp) AS cnt2 FROM pooltemp ) WHERE cnt > 1 – AND cnt <> cnt2 ORDER BY -cnt, time ;

%%sql

SELECT COUNT(*), COUNT(DISTINCT time),
COUNT(DISTINCT (time, temp))
FROM duplicates_detail ;

%%timeit %%sql

SELECT time, avg(temp) AS temp FROM pooltemp GROUP BY time ORDER BY time

%%sql

CREATE OR REPLACE MACRO count_uniq_by(cols) AS TABLE SELECT COUNT(DISTINCT cols) FROM pooltemp